{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using Null\n",
    "\n",
    "- teacher\n",
    "\n",
    "id\t| dept\t| name\t| phone\t| mobile\n",
    "----|-------|-------|-------|-----\n",
    "101\t| 1 | Shrivell\t| 2753 | 07986 555 1234\n",
    "102\t| 1\t| Throd\t    | 2754 | 07122 555 1920\n",
    "103\t| 1\t| Splint\t| 2293\t|\n",
    "104 |\t| Spiregrain | 3287\t|\n",
    "105 | 2\t| Cutflower\t | 3212 | 07996 555 6574\n",
    "106 |\t| Deadyawn | 3345 |\t\n",
    "... |      |        |        |\n",
    "\n",
    "- dept\n",
    "\n",
    "id\t| name\n",
    "----|----\n",
    "1\t| Computing\n",
    "2\t| Design\n",
    "3\t| Engineering\n",
    "... |\n",
    "\n",
    "### Teachers and Departments\n",
    "The school includes many departments. Most teachers work exclusively for a single department. Some teachers have no department.\n",
    "\n",
    "[Selecting NULL values](https://sqlzoo.net/wiki/Selecting_NULL_values)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading spark-stubs, spark-hive\n",
      "Adding Hive conf dir /opt/hive/conf to classpath\n",
      "Creating SparkSession\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "SLF4J: No SLF4J providers were found.\n",
      "SLF4J: Defaulting to no-operation (NOP) logger implementation\n",
      "SLF4J: See https://www.slf4j.org/codes.html#noProviders for further details.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<a target=\"_blank\" href=\"http://jupyter:4040\">Spark UI</a>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36m$ivy.$                                  \n",
       "\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.log4j.{Level, Logger}\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.functions._\n",
       "\n",
       "\u001b[39m\n",
       "\u001b[36mspark\u001b[39m: \u001b[32mSparkSession\u001b[39m = org.apache.spark.sql.SparkSession@73be95f9"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import $ivy.`org.apache.spark::spark-sql:3.4.0`\n",
    "\n",
    "import org.apache.log4j.{Level, Logger}\n",
    "Logger.getLogger(\"org\").setLevel(Level.OFF)\n",
    "\n",
    "import org.apache.spark._\n",
    "import org.apache.spark.sql._\n",
    "import org.apache.spark.sql.functions._\n",
    "\n",
    "val spark = {\n",
    "    NotebookSparkSession.builder()\n",
    "    .progress(false)\n",
    "    .appName(\"app08\")\n",
    "    // .master(\"spark://192.168.31.31:7077\")\n",
    "    .master(\"local[*]\")\n",
    "    .config(\"spark.sql.warehouse.dir\", \n",
    "            \"hdfs://192.168.31.31:9000/user/hive/warehouse\") \n",
    "    .config(\"spark.cores.max\", \"4\") \n",
    "    .config(\"spark.executor.instances\", \"1\") \n",
    "    .config(\"spark.executor.cores\", \"2\") \n",
    "    .config(\"spark.executor.memory\", \"10g\") \n",
    "    .config(\"spark.shuffle.service.enabled\", \"false\") \n",
    "    .config(\"spark.dynamicAllocation.enabled\", \"false\") \n",
    "    .config(\"spark.sql.catalogImplementation\", \"hive\")\n",
    "    .config(\"spark.sql.repl.eagerEval.enabled\", \"true\")\n",
    "    .config(\"spark.driver.allowMultipleContexts\", \"true\")\n",
    "    .getOrCreate()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36mspark.implicits._\n",
       "\u001b[39m\n",
       "defined \u001b[32mfunction\u001b[39m \u001b[36msc\u001b[39m\n",
       "\u001b[36mhiveCxt\u001b[39m: \u001b[32msql\u001b[39m.\u001b[32mhive\u001b[39m.\u001b[32mHiveContext\u001b[39m = org.apache.spark.sql.hive.HiveContext@2b4fd56e"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import spark.implicits._\n",
    "def sc = spark.sparkContext\n",
    "val hiveCxt = new org.apache.spark.sql.hive.HiveContext(sc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defined \u001b[32mclass\u001b[39m \u001b[36mRichDF\u001b[39m"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Credit to Aivean\n",
    "implicit class RichDF(val ds:DataFrame) {\n",
    "    def showHTML(limit: Int = 50, truncate: Int = 100) = {\n",
    "        import xml.Utility.escape\n",
    "        val data = ds.take(limit)\n",
    "        val header = ds.schema.fieldNames.toSeq        \n",
    "        val rows: Seq[Seq[String]] = data.map { row =>\n",
    "          row.toSeq.map {cell =>\n",
    "            val str = cell match {\n",
    "              case null => \"null\"\n",
    "              case binary: Array[Byte] => binary.map(\"%02X\".format(_)).mkString(\"[\", \" \", \"]\")\n",
    "              case array: Array[_] => array.mkString(\"[\", \", \", \"]\")\n",
    "              case seq: Seq[_] => seq.mkString(\"[\", \", \", \"]\")\n",
    "              case _ => cell.toString\n",
    "            }\n",
    "            if (truncate > 0 && str.length > truncate) {\n",
    "              // do not show ellipses for strings shorter than 4 characters.\n",
    "              if (truncate < 4) str.substring(0, truncate)\n",
    "              else str.substring(0, truncate - 3) + \"...\"\n",
    "            } else {\n",
    "              str\n",
    "            }\n",
    "          }: Seq[String]\n",
    "        }\n",
    "    publish.html(s\"\"\" <table>\n",
    "                <tr>\n",
    "                 ${header.map(h => s\"<th>${escape(h)}</th>\").mkString}\n",
    "                </tr>\n",
    "                ${rows.map {row =>\n",
    "                  s\"<tr>${row.map{c => s\"<td>${escape(c)}</td>\" }.mkString}</tr>\"\n",
    "                }.mkString}\n",
    "            </table>\n",
    "        \"\"\")\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[36mteacher\u001b[39m: \u001b[32mDataFrame\u001b[39m = [id: int, dept: int ... 3 more fields]\n",
       "\u001b[36mdept\u001b[39m: \u001b[32mDataFrame\u001b[39m = [id: int, name: string]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val teacher = hiveCxt.table(\"sqlzoo.teacher\")\n",
    "val dept = hiveCxt.table(\"sqlzoo.dept\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. NULL, INNER JOIN, LEFT JOIN, RIGHT JOIN\n",
    "\n",
    "List the teachers who have NULL for their department.\n",
    "\n",
    "> _Why we cannot use =_   \n",
    "> You might think that the phrase dept=NULL would work here but it doesn't - you can use the phrase dept IS NULL\n",
    "> \n",
    "> _That's not a proper explanation._  \n",
    "> No it's not, but you can read a better explanation at Wikipedia:NULL."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th>\n",
       "                </tr>\n",
       "                <tr><td>Spiregrain</td></tr><tr><td>Deadyawn</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(teacher.filter(isnull(teacher(\"dept\")))\n",
    " .select(\"name\").showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.\n",
    "Note the INNER JOIN misses the teachers with no department and the departments with no teacher."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>teacher</th><th>name</th>\n",
       "                </tr>\n",
       "                <tr><td>Shrivell</td><td>Computing</td></tr><tr><td>Throd</td><td>Computing</td></tr><tr><td>Splint</td><td>Computing</td></tr><tr><td>Cutflower</td><td>Design</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(teacher.withColumnRenamed(\"name\", \"teacher\")\n",
    "     .join(dept, teacher(\"dept\")===dept(\"id\"))\n",
    "     .select(\"teacher\", \"name\")\n",
    "     .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.\n",
    "Use a different JOIN so that all teachers are listed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>teacher</th><th>name</th>\n",
       "                </tr>\n",
       "                <tr><td>Shrivell</td><td>Computing</td></tr><tr><td>Throd</td><td>Computing</td></tr><tr><td>Splint</td><td>Computing</td></tr><tr><td>Spiregrain</td><td>null</td></tr><tr><td>Cutflower</td><td>Design</td></tr><tr><td>Deadyawn</td><td>null</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(teacher.withColumnRenamed(\"name\", \"teacher\")\n",
    "    .join(dept, teacher(\"dept\")===dept(\"id\"), joinType=\"left\")\n",
    "    .select(\"teacher\", \"name\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.\n",
    "Use a different JOIN so that all departments are listed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>teacher</th><th>name</th>\n",
       "                </tr>\n",
       "                <tr><td>Splint</td><td>Computing</td></tr><tr><td>Throd</td><td>Computing</td></tr><tr><td>Shrivell</td><td>Computing</td></tr><tr><td>Cutflower</td><td>Design</td></tr><tr><td>null</td><td>Engineering</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(teacher.withColumnRenamed(\"name\", \"teacher\")\n",
    "    .join(dept, teacher(\"dept\")===dept(\"id\"), joinType=\"right\")\n",
    "    .select(\"teacher\", \"name\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Using the [COALESCE](https://sqlzoo.net/wiki/COALESCE) function\n",
    "\n",
    "\n",
    "Use COALESCE to print the mobile number. Use the number '07986 444 2266' if there is no number given. **Show teacher name and mobile number or '07986 444 2266'**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>mobile</th>\n",
       "                </tr>\n",
       "                <tr><td>Shrivell</td><td>07986 555 1234</td></tr><tr><td>Throd</td><td>07122 555 1920</td></tr><tr><td>Splint</td><td>07986 444 2266</td></tr><tr><td>Spiregrain</td><td>07986 444 2266</td></tr><tr><td>Cutflower</td><td>07996 555 6574</td></tr><tr><td>Deadyawn</td><td>07986 444 2266</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(teacher.select(\"name\", \"mobile\")\n",
    " .na.fill(\"07986 444 2266\", Array(\"mobile\"))\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.\n",
    "Use the COALESCE function and a LEFT JOIN to print the teacher name and department name. Use the string 'None' where there is no department."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>teacher</th><th>name</th>\n",
       "                </tr>\n",
       "                <tr><td>Shrivell</td><td>Computing</td></tr><tr><td>Throd</td><td>Computing</td></tr><tr><td>Splint</td><td>Computing</td></tr><tr><td>Spiregrain</td><td>None</td></tr><tr><td>Cutflower</td><td>Design</td></tr><tr><td>Deadyawn</td><td>None</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(teacher.withColumnRenamed(\"name\", \"teacher\")\n",
    "    .join(dept, teacher(\"dept\")===dept(\"id\"), joinType=\"left\")\n",
    "    .select(\"teacher\", \"name\")\n",
    "    .na.fill(\"None\", Array(\"name\"))\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7.\n",
    "Use COUNT to show the number of teachers and the number of mobile phones."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>count(name)</th><th>count(mobile)</th>\n",
       "                </tr>\n",
       "                <tr><td>6</td><td>3</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "teacher.agg(count(\"name\"), count(\"mobile\")).showHTML()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8.\n",
    "Use COUNT and GROUP BY **dept.name** to show each department and the number of staff. Use a RIGHT JOIN to ensure that the Engineering department is listed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>count(teacher)</th>\n",
       "                </tr>\n",
       "                <tr><td>Engineering</td><td>0</td></tr><tr><td>Computing</td><td>3</td></tr><tr><td>Design</td><td>1</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(teacher.withColumnRenamed(\"name\", \"teacher\")\n",
    " .join(dept, teacher(\"dept\")===dept(\"id\"), joinType=\"right\")\n",
    " .groupBy(\"name\")\n",
    " .agg(count(\"teacher\"))\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Using [CASE](https://sqlzoo.net/wiki/CASE)\n",
    "\n",
    "\n",
    "Use CASE to show the **name** of each teacher followed by 'Sci' if the teacher is in **dept** 1 or 2 and 'Art' otherwise."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>dept</th><th>label</th>\n",
       "                </tr>\n",
       "                <tr><td>Shrivell</td><td>1</td><td>Sci</td></tr><tr><td>Throd</td><td>1</td><td>Sci</td></tr><tr><td>Splint</td><td>1</td><td>Sci</td></tr><tr><td>Spiregrain</td><td>null</td><td>Art</td></tr><tr><td>Cutflower</td><td>2</td><td>Art</td></tr><tr><td>Deadyawn</td><td>null</td><td>Art</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(teacher\n",
    " .select($\"name\", $\"dept\", when($\"dept\".isin(List(0, 1): _*), \"Sci\")\n",
    "         .otherwise(\"Art\").alias(\"label\"))\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10.\n",
    "Use CASE to show the name of each teacher followed by 'Sci' if the teacher is in dept 1 or 2, show 'Art' if the teacher's dept is 3 and 'None' otherwise."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>dept</th><th>label</th>\n",
       "                </tr>\n",
       "                <tr><td>Shrivell</td><td>1</td><td>Sci</td></tr><tr><td>Throd</td><td>1</td><td>Sci</td></tr><tr><td>Splint</td><td>1</td><td>Sci</td></tr><tr><td>Spiregrain</td><td>null</td><td>None</td></tr><tr><td>Cutflower</td><td>2</td><td>Sci</td></tr><tr><td>Deadyawn</td><td>null</td><td>None</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(teacher\n",
    " .select($\"name\", $\"dept\", when($\"dept\".isin(List(1, 2): _*), \"Sci\")\n",
    "         .when($\"dept\".isin(List(3): _*), \"Art\")\n",
    "         .otherwise(\"None\").alias(\"label\"))\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Scala",
   "language": "scala",
   "name": "scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".sc",
   "mimetype": "text/x-scala",
   "name": "scala",
   "nbconvert_exporter": "script",
   "version": "2.12.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
